#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))

Introduction

Check out this Kaggle

Get Data

a = read_csv('hotel_bookings.csv') %>%
  clean_names() %>% 
  mutate(across(where(is.character), factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  )

5 min EDA

a %>% head
skimr::skim(a)
Data summary
Name a
Number of rows 119390
Number of columns 32
_______________________
Column type frequency:
Date 1
factor 13
numeric 18
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2014-10-17 2017-09-14 2016-08-07 926

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
agent 0 1 FALSE 334 9: 31961, NUL: 16340, 240: 13922, 1: 7191
arrival_date_month 0 1 FALSE 12 Aug: 13877, Jul: 12661, May: 11791, Oct: 11160
assigned_room_type 0 1 FALSE 12 A: 74053, D: 25322, E: 7806, F: 3751
company 0 1 FALSE 353 NUL: 112593, 40: 927, 223: 784, 67: 267
country 0 1 FALSE 178 PRT: 48590, GBR: 12129, FRA: 10415, ESP: 8568
customer_type 0 1 FALSE 4 Tra: 89613, Tra: 25124, Con: 4076, Gro: 577
deposit_type 0 1 FALSE 3 No : 104641, Non: 14587, Ref: 162
distribution_channel 0 1 FALSE 5 TA/: 97870, Dir: 14645, Cor: 6677, GDS: 193
hotel 0 1 FALSE 2 Cit: 79330, Res: 40060
market_segment 0 1 FALSE 8 Onl: 56477, Off: 24219, Gro: 19811, Dir: 12606
meal 0 1 FALSE 5 BB: 92310, HB: 14463, SC: 10650, Und: 1169
reservation_status 0 1 FALSE 3 Che: 75166, Can: 43017, No-: 1207
reserved_room_type 0 1 FALSE 10 A: 85994, D: 19201, E: 6535, F: 2897

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
adr 0 1 101.83 50.54 -6.38 69.29 94.58 126 5400 <U+2587><U+2581><U+2581><U+2581><U+2581>
adults 0 1 1.86 0.58 0.00 2.00 2.00 2 55 <U+2587><U+2581><U+2581><U+2581><U+2581>
arrival_date_day_of_month 0 1 15.80 8.78 1.00 8.00 16.00 23 31 <U+2587><U+2587><U+2587><U+2587><U+2586>
arrival_date_week_number 0 1 27.17 13.61 1.00 16.00 28.00 38 53 <U+2585><U+2587><U+2587><U+2587><U+2585>
arrival_date_year 0 1 2016.16 0.71 2015.00 2016.00 2016.00 2017 2017 <U+2583><U+2581><U+2587><U+2581><U+2586>
babies 0 1 0.01 0.10 0.00 0.00 0.00 0 10 <U+2587><U+2581><U+2581><U+2581><U+2581>
booking_changes 0 1 0.22 0.65 0.00 0.00 0.00 0 21 <U+2587><U+2581><U+2581><U+2581><U+2581>
children 4 1 0.10 0.40 0.00 0.00 0.00 0 10 <U+2587><U+2581><U+2581><U+2581><U+2581>
days_in_waiting_list 0 1 2.32 17.59 0.00 0.00 0.00 0 391 <U+2587><U+2581><U+2581><U+2581><U+2581>
is_canceled 0 1 0.37 0.48 0.00 0.00 0.00 1 1 <U+2587><U+2581><U+2581><U+2581><U+2585>
is_repeated_guest 0 1 0.03 0.18 0.00 0.00 0.00 0 1 <U+2587><U+2581><U+2581><U+2581><U+2581>
lead_time 0 1 104.01 106.86 0.00 18.00 69.00 160 737 <U+2587><U+2582><U+2581><U+2581><U+2581>
previous_bookings_not_canceled 0 1 0.14 1.50 0.00 0.00 0.00 0 72 <U+2587><U+2581><U+2581><U+2581><U+2581>
previous_cancellations 0 1 0.09 0.84 0.00 0.00 0.00 0 26 <U+2587><U+2581><U+2581><U+2581><U+2581>
required_car_parking_spaces 0 1 0.06 0.25 0.00 0.00 0.00 0 8 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_week_nights 0 1 2.50 1.91 0.00 1.00 2.00 3 50 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.00 1.00 2 19 <U+2587><U+2581><U+2581><U+2581><U+2581>
total_of_special_requests 0 1 0.57 0.79 0.00 0.00 0.00 1 5 <U+2587><U+2581><U+2581><U+2581><U+2581>

clean data

# these numeric vars s/b factor vars
a = a %>% mutate_at(vars(arrival_date_day_of_month, arrival_date_week_number, arrival_date_year, is_canceled, is_repeated_guest), factor)

# reordering df
a = a %>% select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  )

EDA: datatime vars

range

a %>% select(where(is.Date))
a %>% pull(reservation_status_date) %>% range
## [1] "2014-10-17" "2017-09-14"

time series count graph – ungrouped

a %>% count(reservation_status_date, name = 'count') %>%
  arrange(reservation_status_date) %>%
  plot_ly(
    x = ~reservation_status_date,
    y = ~count
  ) %>% layout(
    title = 'booking counts by date',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

time series count graph – grouped

a %>% group_by(reservation_status_date, hotel) %>%
  summarise(count = n()) %>% 
  arrange(reservation_status_date) %>%
  plot_ly(
    x = ~reservation_status_date,
    y = ~count,
    color = ~hotel,
    alpha = 0.7
  ) %>% layout(
    title = 'booking counts by date/hotel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )
## `summarise()` regrouping output by 'reservation_status_date' (override with `.groups` argument)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

time series count graph – grouped

a %>% group_by(reservation_status_date, customer_type) %>%
  summarise(count = n()) %>% 
  arrange(reservation_status_date) %>%
  plot_ly(
    x = ~reservation_status_date,
    y = ~count,
    color = ~customer_type,
    alpha = 0.7
  ) %>% layout(
    title = 'booking counts by date/customer_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )
## `summarise()` regrouping output by 'reservation_status_date' (override with `.groups` argument)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

time series count graph – grouped

a %>% group_by(reservation_status_date, deposit_type) %>%
  summarise(count = n()) %>% 
  arrange(reservation_status_date) %>%
  plot_ly(
    x = ~reservation_status_date,
    y = ~count,
    color = ~deposit_type,
    alpha = 0.7
  ) %>% layout(
    title = 'booking counts by date/deposit_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )
## `summarise()` regrouping output by 'reservation_status_date' (override with `.groups` argument)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

time series count graph – grouped

a %>% group_by(reservation_status_date, distribution_channel) %>%
  summarise(count = n()) %>% 
  arrange(reservation_status_date) %>%
  plot_ly(
    x = ~reservation_status_date,
    y = ~count,
    color = ~distribution_channel,
    alpha = 0.7
  ) %>% layout(
    title = 'booking counts by date/distribution_channel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )
## `summarise()` regrouping output by 'reservation_status_date' (override with `.groups` argument)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

EDA: nom vars

sample data

a %>% select(where(is.factor)) %>% slice_sample(n = 10)

glimpse structure

a %>% select(where(is.factor)) %>% glimpse
## Rows: 119,390
## Columns: 18
## $ agent                     <fct> NULL, NULL, NULL, 304, 240, 240, NULL, 30...
## $ arrival_date_day_of_month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ arrival_date_month        <fct> July, July, July, July, July, July, July,...
## $ arrival_date_week_number  <fct> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...
## $ arrival_date_year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015,...
## $ assigned_room_type        <fct> C, C, C, A, A, A, C, C, A, D, E, D, E, G,...
## $ company                   <fct> NULL, NULL, NULL, NULL, NULL, NULL, NULL,...
## $ country                   <fct> PRT, PRT, GBR, GBR, GBR, GBR, PRT, PRT, P...
## $ customer_type             <fct> Transient, Transient, Transient, Transien...
## $ deposit_type              <fct> No Deposit, No Deposit, No Deposit, No De...
## $ distribution_channel      <fct> Direct, Direct, Direct, Corporate, TA/TO,...
## $ hotel                     <fct> Resort Hotel, Resort Hotel, Resort Hotel,...
## $ is_canceled               <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,...
## $ is_repeated_guest         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ market_segment            <fct> Direct, Direct, Direct, Corporate, Online...
## $ meal                      <fct> BB, BB, BB, BB, BB, BB, BB, FB, BB, HB, B...
## $ reservation_status        <fct> Check-Out, Check-Out, Check-Out, Check-Ou...
## $ reserved_room_type        <fct> C, C, A, A, A, A, C, C, A, D, E, D, D, G,...

check missing values

a %>% select(where(is.factor)) %>% miss_var_summary

distribution of level counts per factor

jpal = colorRampPalette(brewer.pal(8,'Dark2'))(15)

a %>% select(where(is.factor)) %>%
  map(n_unique) %>%
  as.tibble() %>%
  pivot_longer(everything()) %>%
  plot_ly(y = ~name, x = ~value, color = ~name, colors = jpal) %>%
  add_bars() %>%
  hide_legend() %>% 
  layout(
    title = 'distribution of level counts per factor',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

reference: names of unique levels

a %>% select(where(is.factor)) %>%
  map(unique)
## $agent
##   [1] NULL 304  240  303  15   241  8    250  115  5    175  134  156  243  242 
##  [16] 3    105  40   147  306  184  96   2    127  95   146  9    177  6    143 
##  [31] 244  149  167  300  171  305  67   196  152  142  261  104  36   26   29  
##  [46] 258  110  71   181  88   251  275  69   248  208  256  314  126  281  273 
##  [61] 253  185  330  334  328  326  321  324  313  38   155  68   335  308  332 
##  [76] 94   348  310  339  375  66   327  387  298  91   245  385  257  393  168 
##  [91] 405  249  315  75   128  307  11   436  1    201  183  223  368  336  291 
## [106] 464  411  481  10   154  468  410  390  440  495  492  493  434  57   531 
## [121] 420  483  526  472  429  16   446  34   78   139  252  270  47   114  301 
## [136] 193  182  135  350  195  352  355  159  363  384  360  331  367  64   406 
## [151] 163  414  333  427  431  430  426  438  433  418  441  282  432  72   450 
## [166] 180  454  455  59   451  254  358  469  165  467  510  337  476  502  527 
## [181] 479  508  535  302  497  187  13   7    27   14   22   17   28   42   20  
## [196] 19   45   37   61   39   21   24   41   50   30   54   52   12   44   31  
## [211] 83   32   63   60   55   56   89   87   118  86   85   210  214  129  179 
## [226] 138  174  170  153  93   151  119  35   173  58   53   133  79   235  192 
## [241] 191  236  162  215  157  287  132  234  98   77   103  107  262  220  121 
## [256] 205  378  23   296  290  229  33   286  276  425  484  323  403  219  394 
## [271] 509  111  423  4    70   82   81   74   92   99   90   112  117  106  148 
## [286] 158  144  211  213  216  232  150  267  227  247  278  280  285  289  269 
## [301] 295  265  288  122  294  325  341  344  346  359  283  364  370  371  25  
## [316] 141  391  397  416  404  299  197  73   354  444  408  461  388  453  459 
## [331] 474  475  480  449 
## 334 Levels: 1 10 103 104 105 106 107 11 110 111 112 114 115 117 118 119 ... NULL
## 
## $arrival_date_day_of_month
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
## 
## $arrival_date_month
##  [1] July      August    September October   November  December  January  
##  [8] February  March     April     May       June     
## 12 Levels: April August December February January July June March ... September
## 
## $arrival_date_week_number
##  [1] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [26] 52 53 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [51] 24 25 26
## 53 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 53
## 
## $arrival_date_year
## [1] 2015 2016 2017
## Levels: 2015 2016 2017
## 
## $assigned_room_type
##  [1] C A D E G F I B H P L K
## Levels: A B C D E F G H I K L P
## 
## $company
##   [1] NULL 110  113  270  178  240  154  144  307  268  59   204  312  318  94  
##  [16] 174  274  195  223  317  281  118  53   286  12   47   324  342  373  371 
##  [31] 383  86   82   218  88   31   397  392  405  331  367  20   83   416  51  
##  [46] 395  102  34   84   360  394  457  382  461  478  386  112  486  421  9   
##  [61] 308  135  224  504  269  356  498  390  513  203  263  477  521  169  515 
##  [76] 445  337  251  428  292  388  130  250  355  254  543  531  528  62   120 
##  [91] 42   81   116  530  103  39   16   92   61   501  165  291  290  43   325 
## [106] 192  108  200  465  287  297  490  482  207  282  437  225  329  272  28  
## [121] 77   338  72   246  319  146  159  380  323  511  407  278  80   403  399 
## [136] 14   137  343  346  347  349  289  351  353  54   99   358  361  362  366 
## [151] 372  365  277  109  377  379  22   378  330  364  401  232  255  384  167 
## [166] 212  514  391  400  376  402  396  302  398  6    370  369  409  168  104 
## [181] 408  413  148  10   333  419  415  424  425  423  422  435  439  442  448 
## [196] 443  454  444  52   459  458  456  460  447  470  466  484  184  485  32  
## [211] 487  491  494  193  516  496  499  29   78   520  507  506  512  126  64  
## [226] 242  518  523  539  534  436  525  541  40   455  410  45   38   49   48  
## [241] 67   68   65   91   37   8    179  209  219  221  227  153  186  253  202 
## [256] 216  275  233  280  309  321  93   316  85   107  350  279  334  348  150 
## [271] 73   385  418  197  450  452  115  46   76   96   100  105  101  122  11  
## [286] 139  142  127  143  140  149  163  160  180  238  183  222  185  217  215 
## [301] 213  237  230  234  35   245  158  258  259  260  411  257  271  18   106 
## [316] 210  273  71   284  301  305  293  264  311  304  313  288  320  314  332 
## [331] 341  352  243  368  393  132  220  412  420  426  417  429  433  446  357 
## [346] 479  483  489  229  481  497  451  492 
## 353 Levels: 10 100 101 102 103 104 105 106 107 108 109 11 110 112 113 ... NULL
## 
## $country
##   [1] PRT  GBR  USA  ESP  IRL  FRA  NULL ROU  NOR  OMN  ARG  POL  DEU  BEL  CHE 
##  [16] CN   GRC  ITA  NLD  DNK  RUS  SWE  AUS  EST  CZE  BRA  FIN  MOZ  BWA  LUX 
##  [31] SVN  ALB  IND  CHN  MEX  MAR  UKR  SMR  LVA  PRI  SRB  CHL  AUT  BLR  LTU 
##  [46] TUR  ZAF  AGO  ISR  CYM  ZMB  CPV  ZWE  DZA  KOR  CRI  HUN  ARE  TUN  JAM 
##  [61] HRV  HKG  IRN  GEO  AND  GIB  URY  JEY  CAF  CYP  COL  GGY  KWT  NGA  MDV 
##  [76] VEN  SVK  FJI  KAZ  PAK  IDN  LBN  PHL  SEN  SYC  AZE  BHR  NZL  THA  DOM 
##  [91] MKD  MYS  ARM  JPN  LKA  CUB  CMR  BIH  MUS  COM  SUR  UGA  BGR  CIV  JOR 
## [106] SYR  SGP  BDI  SAU  VNM  PLW  QAT  EGY  PER  MLT  MWI  ECU  MDG  ISL  UZB 
## [121] NPL  BHS  MAC  TGO  TWN  DJI  STP  KNA  ETH  IRQ  HND  RWA  KHM  MCO  BGD 
## [136] IMN  TJK  NIC  BEN  VGB  TZA  GAB  GHA  TMP  GLP  KEN  LIE  GNB  MNE  UMI 
## [151] MYT  FRO  MMR  PAN  BFA  LBY  MLI  NAM  BOL  PRY  BRB  ABW  AIA  SLV  DMA 
## [166] PYF  GUY  LCA  ATA  GTM  ASM  MRT  NCL  KIR  SDN  ATF  SLE  LAO 
## 178 Levels: ABW AGO AIA ALB AND ARE ARG ARM ASM ATA ATF AUS AUT AZE BDI ... ZWE
## 
## $customer_type
## [1] Transient       Contract        Transient-Party Group          
## Levels: Contract Group Transient Transient-Party
## 
## $deposit_type
## [1] No Deposit Refundable Non Refund
## Levels: No Deposit Non Refund Refundable
## 
## $distribution_channel
## [1] Direct    Corporate TA/TO     Undefined GDS      
## Levels: Corporate Direct GDS TA/TO Undefined
## 
## $hotel
## [1] Resort Hotel City Hotel  
## Levels: City Hotel Resort Hotel
## 
## $is_canceled
## [1] 0 1
## Levels: 0 1
## 
## $is_repeated_guest
## [1] 0 1
## Levels: 0 1
## 
## $market_segment
## [1] Direct        Corporate     Online TA     Offline TA/TO Complementary
## [6] Groups        Undefined     Aviation     
## 8 Levels: Aviation Complementary Corporate Direct Groups ... Undefined
## 
## $meal
## [1] BB        FB        HB        SC        Undefined
## Levels: BB FB HB SC Undefined
## 
## $reservation_status
## [1] Check-Out Canceled  No-Show  
## Levels: Canceled Check-Out No-Show
## 
## $reserved_room_type
##  [1] C A D E G F H L P B
## Levels: A B C D E F G H L P
a = a %>% mutate(arrival_date_month = factor(arrival_date_month, levels = c('January','February','March','April','May','June','July','August','September','October','November','December')))

EDA: num vars

check missing values

a %>% select(where(is.numeric)) %>% miss_var_summary

sample data

a %>% select(where(is.numeric)) %>% slice_sample(n = 10)

glimpse structure

a %>% select(where(is.numeric)) %>% glimpse
## Rows: 119,390
## Columns: 13
## $ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98....
## $ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 7...
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, ...
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, ...

viz: distribution histogram

a %>% select(where(is.numeric)) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)

viz: distribution density

a %>% select(where(is.numeric)) %>% DataExplorer::plot_density(nrow = 2, ncol = 1)

viz: distribution bivariate

a %>% select(hotel, where(is.numeric)) %>% DataExplorer::plot_boxplot(by = 'hotel', nrow = 3, ncol = 1)

## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

correlations: viz

a %>% select(where(is.numeric)) %>% dlookr::plot_correlate()

a %>% select(where(is.numeric)) %>% GGally::ggcorr(palette = "RdBu", label = TRUE)